/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2005 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* Version-control identification string compiled into the object: an
   "@(#)" marker followed by the expanded RCS/CVS $Id$ keyword. */
static const char __idstring[] = "@(#)$Id: mx_iokit.cpp,v 1.70 2006/12/04 15:17:17 gallatin Exp $";
extern "C" {
#include "mx_arch.h"
#include "mx_misc.h"
#include "mx_instance.h"
#include "mx_malloc.h"
#include "mx_pio.h"
#include "mx_peer.h"
#include <sys/sysctl.h>
}
/* Number of pages MX may still pin.  Seeded in mx_driver::start(),
   decremented by mx_pin_page_cluster() and given back by
   mx_unpin_page_cluster(). */
SInt32 mx_max_pinned_pages;
/* Count of pin requests rejected because they asked for more pages
   than mx_max_pinned_pages currently allows. */
long mx_pin_page_nospace;
extern "C" {
/* sysctl node "mx" under _net plus one entry per counter above.  The
   OIDs are registered in mx_driver::start() and unregistered in
   mx_driver::free().
   NOTE(review): mx_max_pinned_pages is an SInt32 but is exported with
   SYSCTL_LONG -- confirm the sizes agree on every supported kernel. */
SYSCTL_NODE(_net, OID_AUTO, mx, CTLFLAG_RW, 0, "MX");
SYSCTL_LONG(_net_mx, OID_AUTO, max_pinned_pages, CTLFLAG_RW,
	    &mx_max_pinned_pages, "max pinned pages remaining");
SYSCTL_LONG(_net_mx, OID_AUTO, pin_page_nospace, CTLFLAG_RW, 
	   &mx_pin_page_nospace, "pins failed due to memory limit exceeded");
}


#include <IOKit/pci/IOPCIDevice.h>
#include <IOKit/IODeviceMemory.h>
#include <IOKit/IOInterruptEventSource.h>
#include <IOKit/IOTimerEventSource.h> 
#include <UserNotification/KUNCUserNotifications.h>
#include <IOKit/IOMapper.h>

#include "mx_driver.h" 

/* This is used to serialize the driver attach.  MacOSX guarantees that
   C++ constructors are called before driver's ::start and ::free routines,
   so we can use this fact to initialize a lock. Idea from gvdl@apple.com */

class mx_driver_attach
{ 
public: 
     IOLock *lock; 
     mx_driver_attach() { 
         lock = IOLockAlloc(); 
     }; 
     ~mx_driver_attach() { 
         if (lock) IOLockFree(lock); 
     }; 
}; 
 
/* The single attach/detach serialization object; its lock is held for
   the whole of mx_driver::start() and mx_driver::free(). */
static mx_driver_attach mx_attach_detach;

/* Table with mx_max_minor slots, allocated zero-filled by the first
   board in mx_driver::start() and freed when the last board goes away. */
void **mx_specials = 0;
/* Two control slots, cleared by the first board in mx_driver::start(). */
void *mx_ctl[2];
/* Endpoint-state table with mx_max_minor slots; same lifetime as
   mx_specials. */
mx_endpt_state_t **mx_endpts = 0;
/* Computed in mx_driver::start() as
   mx_max_instance * 4 + mx_max_instance * mx_max_endpoints. */
int mx_max_minor = 0;
/* Number of boards that have entered start() and not yet been undone;
   updated with OSIncrementAtomic/OSDecrementAtomic. */
volatile SInt32 mx_macos_boards = 0;

/* When non-NULL, replaces the default helper script path used by
   mx_start_mapper(). */
char *mx_mapper_path = 0;

/*
 * Launch the user-mode mapper helper as root through KUNCExecute().
 * The script path is mx_mapper_path when that is set, otherwise the
 * built-in default.  Returns the KUNCExecute() result (0 on success);
 * a failure is logged but otherwise left to the caller.
 */
int
mx_start_mapper(mx_instance_state_t *is)
{
  char *helper = mx_mapper_path;
  kern_return_t kr;

  if (helper == NULL) {
    helper = "/opt/mx/etc/mx_macosx_start_mapper.sh";
  }

  MX_INFO(("Starting usermode helper: %s\n", helper));
  kr = KUNCExecute(helper, kOpenAppAsRoot, kOpenApplicationPath);
  if (kr != 0) {
    MX_INFO(("Problem starting usermode mapper, ret = %d\n", kr));
  }
  return kr;
}

/*
 * Counterpart of mx_start_mapper().  Nothing is done here; the call
 * always reports success (0).
 */
int
mx_stop_mapper(mx_instance_state_t *is)
{
  (void) is;	/* unused */
  return 0;
}


/*
 * This is our IO-Kit "Class".  We interact with IOKit as little as
 * possible..
 */


/* Emit the IOKit metaclass and constructor/destructor boilerplate for
   mx_driver, naming IOService as its superclass. */
OSDefineMetaClassAndStructors (mx_driver, IOService);

/*
 * This is the IOKit "attach" routine.  It's called at module load time
 * and sets up the driver's class.
 */

/*
 * Attach one board.
 *
 * The whole routine runs under mx_attach_detach.lock.  The first board
 * (unit 0) additionally parses the boot arguments, allocates the global
 * minor tables, initialises the common driver code, creates the device
 * nodes and computes the pinned-page budget.  Every board then opens
 * its PCI nub, hooks its interrupt into the work loop, maps BAR 0,
 * allocates the per-instance state and calls mx_instance_init().
 *
 * Returns true on success.  On failure everything acquired so far is
 * released in reverse order through the abort_with_* labels, loaded_ok
 * is cleared so that free() does no further teardown, and false is
 * returned.
 */
bool mx_driver::start (IOService *provider)
{
  IOWorkLoop *myWorkLoop;
  int unit;
  unsigned int instance_number;
  uint64_t max_mem;

  IOLockLock(mx_attach_detach.lock);

  /*
   * cache our instance number;
   */
  instance_number = OSIncrementAtomic((SInt32 *)&mx_macos_boards);
  unit = instance_number;

  if (instance_number >= mx_max_instance) {
    MX_WARN(("Too many boards too attach all of them "
	     "only %d boards configured, increase mx_max_instance\n",
	     mx_max_instance));
    goto abort_with_instance_number;
  }

  if (unit == 0) {
    /* One-time, driver-wide setup performed by the first board only. */
#if MX_DEBUG
    PE_parse_boot_arg("mx.debug_mask", &mx_debug_mask);
#endif    
    PE_parse_boot_arg("mx.max_instance", &mx_max_instance);
    PE_parse_boot_arg("mx.small_message_threshold", 
		      &mx_small_message_threshold);
    PE_parse_boot_arg("mx.medium_message_threshold", 
		      &mx_medium_message_threshold);
    PE_parse_boot_arg("mx.security_disabled", &mx_security_disabled);
    PE_parse_boot_arg("mx.max_nodes", &mx_max_nodes);
    PE_parse_boot_arg("mx.intr_coal_delay", &mx_intr_coal_delay);
    PE_parse_boot_arg("mx.override_e_to_f", &mx_override_e_to_f);
    mx_pcie_down_on_error = 0;
    PE_parse_boot_arg("mx.pcie_down_on_error", &mx_pcie_down_on_error);
    mx_max_minor = mx_max_instance * 4 + mx_max_instance * mx_max_endpoints;
    if (mx_max_minor >= MX_CTL) {
      MX_NOTE(("mx_max_minor collides with MX_CTL\n"));
      goto abort_with_instance_number;
    }
    mx_specials = (void **)mx_kmalloc(mx_max_minor * sizeof(mx_specials[0]), 
				      MX_MZERO);
    mx_ctl[0] = mx_ctl[1] = 0;
    mx_endpts = (mx_endpt_state_t **)
      mx_kmalloc(mx_max_minor * sizeof(mx_endpts[0]), MX_MZERO);
    if (!mx_specials || !mx_endpts)
      goto abort_with_instance_number;
    if (mx_init_driver())
      goto abort_with_instance_number;

      /*
       * Create devfs entries
       */
    mx_major = mx_special_create();
    if (mx_major == -1) {
      goto abort_with_instance_number;
    }

/*
 * Limit MX to pinning 50% of physical memory 
 */

    mx_get_memory_size(&max_mem);
    mx_max_pinned_pages = (max_mem >> MX_PAGE_SHIFT) / 2;

    /* The IOMMU on G5s is limited to mapping 2GB of RAM.  We need
       to be very careful to never fill the IOMMU map, as the
       machine will simply panic at that point. So we limit MX to
       consuming 1.5 GB of map space. 
    */
#define IOMMU_LIMIT (1450*1024*1024)
    if (IOMapper::gSystem) {
      if ((unsigned)mx_max_pinned_pages > atop(IOMMU_LIMIT)) {
	MX_INFO(("Reduced pinnable memory to 1.5GB due to limited DART address space\n"));
	mx_max_pinned_pages = atop(IOMMU_LIMIT);
      }
    }
  }

  /* Start our superclass first. */
  if (IOService::start (provider) == false)
    goto abort_with_instance_number;

  /* Cache our provider to an instance variable. */
  pciNub = OSDynamicCast (IOPCIDevice, provider);
  if (!pciNub)
    goto abort_with_super;
  
  /* Retain provider, released in free(). */
  pciNub->retain ();

  /* Open our provider. */

  if (pciNub->open (this) == false)
    goto abort_with_provider;

#if 0
  /* We may not need/want to do this.  It seems to cause problems when
     the driver is loaded, then the machine is suspended, then the
     driver will not reload. ourDesiredPowerState changes from 2
     to 0, which is the source of the problem, I think */

  /* Request power.  If the machine was asleep before our driver was
     loaded, PCI config space may not be valid */
  if (pciNub->requestPowerDomainState(kIOPMPowerOn,
                                      (IOPowerConnection *)getParentEntry (gIOPowerPlane),
                                      IOPMLowestState) != IOPMNoErr) {
    MX_WARN(("Failed to power-on board\n"));
    goto abort_with_provider_open;
  }
#endif  

  /*
   * Create a "workloop" -- this is basically an interrupt thread
   * which is awakened for every hardware interrupt.
   */

  myWorkLoop = getWorkLoop ();
  if (!myWorkLoop) {
    MX_WARN (("MX:  Couldn't allocate workloop\n"));
    goto abort_with_provider_open;
  }
  
  interruptSource = IOInterruptEventSource::
    interruptEventSource ((OSObject *) this,
                          (IOInterruptEventAction) &mx_driver::interruptOccurred,
                          (IOService *) provider, (int) 0);

  if (!interruptSource) {
      MX_WARN(("MX:  Couldn't allocate Interrupt handler\n"));
      goto abort_with_workloop;
  }

  if (myWorkLoop->addEventSource (interruptSource) != kIOReturnSuccess) {
    MX_WARN(("Couldn't add Interrupt event source\n"));
    goto abort_with_interrupt_source;
  }

  if (unit == 0) {
    mx_start_timeout_thread();
  }

  /* map memory ( unmap ..magically.. taken care of by OS-X ) */
  pcimap = provider->mapDeviceMemoryWithIndex(0, kIOMapWriteCombineCache);
  if (pcimap == 0)
    {
      MX_WARN(("Couldn't map PCI memory\n"));
      goto abort_with_workloop_interrupt_source;
    }

  is = (mx_instance_state_t *)mx_kmalloc(sizeof(*is), MX_MZERO | MX_WAITOK);
  if (!is) {
    MX_WARN(("Failed to allocate is\n"));
    goto abort_with_map;
  }
  is->arch.csr = (caddr_t) pcimap->getVirtualAddress(); 
  if (is->arch.csr == 0) {
    MX_WARN(("Failed to get vaddr for board\n"));
    /* "is" exists from here on, so unwind through abort_with_is */
    goto abort_with_is;
  }

  /* hack.  MacOSX does not export the size of selinfo.
     It is 24 in the kernel sources, so allocate 10x
     that much and hope it never bloats much more..*/
  is->arch.raw_si = (struct selinfo *) 
      mx_kmalloc(24 * 10, MX_MZERO|MX_WAITOK);
  if (is->arch.raw_si == NULL)
    goto abort_with_is;
  /* save our class so that we can call C++ IOKit functions from C */
  is->arch.cpp_class = (void *)this;


  /* enable interrupts from the OS's point of view*/
  interruptSource->enable ();
  getWorkLoop ()->enableAllInterrupts ();

  if (mx_instance_init(is, unit) != 0) {
    MX_NOTE (("mx_instance_init failed\n"));
    goto abort_with_raw_si;
  }
  /* presumably mx_instance_init() returns with is->sync held -- TODO confirm */
  mx_mutex_exit(&is->sync);
  loaded_ok = 1;

  registerService();

  if (unit == 0) {
    sysctl_register_oid(&sysctl__net_mx);
    sysctl_register_oid(&sysctl__net_mx_max_pinned_pages);
    sysctl_register_oid(&sysctl__net_mx_pin_page_nospace);
  }
  IOLockUnlock(mx_attach_detach.lock);
  
  return true;

 abort_with_raw_si:
  /* The interrupt source was enabled above; turn it off again before
     the per-instance state goes away.
     NOTE(review): assumes interruptOccurred() reads "is" -- confirm. */
  interruptSource->disable ();
  mx_kfree(is->arch.raw_si);
  is->arch.raw_si = NULL;

 abort_with_is:
  mx_kfree(is);
  is = NULL;

 abort_with_map:
  pcimap->release();

 abort_with_workloop_interrupt_source:
  getWorkLoop ()->removeEventSource (interruptSource);

 abort_with_interrupt_source:
  interruptSource->release ();

 abort_with_workloop:
  myWorkLoop->release ();

 abort_with_provider_open:
  pciNub->close (this);

 abort_with_provider:
  pciNub->release ();
  pciNub = 0;


 abort_with_super:
  IOService::stop (provider);

 abort_with_instance_number:
  /* OSDecrementAtomic returns the value before the decrement, so 1
     means this was the only board still counted. */
  instance_number = OSDecrementAtomic((SInt32 *)&mx_macos_boards);
  if (instance_number == 1) {
    mx_stop_timeout_thread();
    mx_finalize_driver();
    mx_cdevsw_remove();
    if (mx_major != -1) 
      mx_special_destroy();
    if (mx_specials)
      mx_kfree(mx_specials);
    mx_specials = 0;
    if (mx_endpts)
      mx_kfree(mx_endpts);
    mx_endpts = 0;
  }

  loaded_ok = 0;
  IOLockUnlock(mx_attach_detach.lock);
  return false;
}

/*
 * This is the module unload routine.  We cannot veto unload; the best
 * we can do is to sleep..
 */

/*
 * Detach one board.  Runs under mx_attach_detach.lock.
 *
 * When start() did not complete (loaded_ok == 0) it has already undone
 * its own work, so only the superclass free() is invoked.  Otherwise
 * the interrupt source, work loop, instance state, PCI mapping and
 * provider are released, and the board that was the last one counted
 * also tears down the driver-wide state.
 */
void
mx_driver::free ()
{
  int boards_before, rc;

  IOLockLock(mx_attach_detach.lock);

  if (loaded_ok != 0) {
    /* value before the decrement: 1 means we are the last board */
    boards_before = OSDecrementAtomic((SInt32 *)&mx_macos_boards);

    mx_cdevsw_remove();

    /* silence the board, then unhook and drop the interrupt plumbing */
    if (is != NULL && is->board_ops.disable_interrupt != NULL) {
      is->board_ops.disable_interrupt(is);
    }
    getWorkLoop ()->removeEventSource (interruptSource);
    interruptSource->release ();
    workLoop->release ();

    if (is != NULL && is->arch.raw_si != NULL) {
      mx_kfree(is->arch.raw_si);
    }

    /* we cannot veto the unload: keep closing endpoints until the
       instance can be finalized */
    for (rc = mx_instance_finalize (is);
	 rc == EBUSY;
	 rc = mx_instance_finalize (is)) {
      IOLog("Closing all endpoints on %d\n", is->id);
      mx_closeall();
      IOSleep(500);
    }

    pcimap->unmap();
    pcimap->release();
    pciNub->close (this);
    pciNub->release ();
    pciNub = 0;

    if (boards_before != 1) {
      IOLog("instance number = %d: NOT calling mx_finalize\n", boards_before);
    } else {
      /* last board: unregister in the reverse order of registration */
      sysctl_unregister_oid(&sysctl__net_mx_pin_page_nospace);
      sysctl_unregister_oid(&sysctl__net_mx_max_pinned_pages);
      sysctl_unregister_oid(&sysctl__net_mx);

      IOLog("instance number = %d: calling mx_finalize\n", boards_before);
      mx_stop_timeout_thread();
      mx_finalize_driver ();
      if (mx_major != -1) {
	mx_special_destroy ();
      }
      if (mx_specials) {
	mx_kfree(mx_specials);
      }
      mx_specials = 0;
      if (mx_endpts) {
	mx_kfree(mx_endpts);
      }
      mx_endpts = 0;
    }

    mx_kfree(is);
  }

  IOService::free ();               /* pass free to our superclass */
  IOLog("detached\n");
  IOLockUnlock(mx_attach_detach.lock);
}


bool
mx_driver::createWorkLoop()
{
  workLoop = IOWorkLoop::workLoop();

  return (workLoop != 0);
}

/*
 * Return this driver's work loop, creating it on first use.  The
 * result may be NULL if that creation fails.
 */
IOWorkLoop *
mx_driver::getWorkLoop()
{
  if (workLoop) {
    return workLoop;
  }

  workLoop = IOWorkLoop::workLoop();
  return workLoop;
}

/* 
 * XXXX power management!
 *
 * If we don't override joinPMtree(), and GM has previously been
 * loaded, then we lose our PCI device mapping (or power?!?)  when
 * the superclass' joinPMtree() is called from
 * IONetworkController::start() when the mx_ethernet driver attaches.
 * This causes us to lose contact with our device, and chaos ensues.
 */

/*
 * Deliberately empty override: the superclass implementation is never
 * reached through this class.
 */
void
mx_driver::joinPMtree(IOService *driver)
{
  /* debugging aid: IOLog("mx_driver::joinPMtree() called\n"); */
}

extern "C" {

/* pinning / unpinning pages */

#if MX_DARWIN_XX >= 8

/*
 * Build an IOMemoryDescriptor for a user range given as a 64-bit
 * address, by wrapping the range in a single-segment uio.
 *
 *   task     task whose address space contains the range
 *   addr     start of the range
 *   len      length of the range in bytes
 *   uio_out  receives the uio backing the descriptor on success, and
 *            NULL on failure
 *
 * Returns the descriptor, or NULL on failure.  The descriptor is
 * created with kIOMemoryAsReference, so on success the caller owns
 * both objects and must uio_free() the uio after releasing the
 * descriptor.
 */
static inline IOMemoryDescriptor *
mx_alloc_md(task_t task, mx_uaddr_t addr, IOByteCount len, struct uio **uio_out)
{
  IOMemoryDescriptor *md = NULL;
  IOOptionBits options;
  uio_t uio;

  /* give the caller a defined value on every failure path */
  *uio_out = NULL;

  uio = uio_create(1, 0, UIO_USERSPACE64, UIO_READ);
  if (uio == NULL) {
    IOLog("failed to create uio\n");    
    return NULL;
  }

  if (uio_addiov(uio, addr, len) != 0) {
    IOLog("failed to create add iov\n");    
    goto abort_with_uio;
  }

  options = kIOMemoryTypeUIO | kIOMemoryAsReference | kIODirectionOutIn;
  md = IOMemoryDescriptor::withOptions(uio, 1, 0, task, options);
  if (md == NULL) {
    IOLog("failed to create iomd\n");
    goto abort_with_uio;
  }

  *uio_out = uio;
  return md;

 abort_with_uio:
  uio_free(uio);
  return NULL;
}

#endif  /* MX_DARWIN_XX >= 8 */

/*
 * Pin a single page: a one-page call to mx_pin_vpages().  The
 * MCP-format address that call produces is discarded.
 */
int
mx_pin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags, uint64_t memory_context)
{
  mcp_dma_addr_t unused_desc;
  int rc;

  rc = mx_pin_vpages(is, pin, &unused_desc, 1, flags, memory_context);
  return rc;
}


/* Undo mx_pin_page(): a one-page call to mx_unpin_vpages(). */
void
mx_unpin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags)
{
  const int one_page = 1;

  mx_unpin_vpages(is, pin, one_page, flags);
}

/*
 * Wire down npages virtually contiguous MX pages starting at
 * pins[0].va and record the DMA address of each page in pins[i].dma.
 *
 *   flags         MX_PIN_KERNEL selects a kernel address; otherwise the
 *                 address belongs to the current task
 *   is_64bit      non-zero when the user address needs the uio-based
 *                 descriptor path (mx_alloc_md)
 *   arch_private  unused here
 *
 * On success the memory descriptor (and uio, if any) are stashed in
 * pins[0] for mx_unpin_page_cluster(), npages is subtracted from
 * mx_max_pinned_pages and 0 is returned.
 *
 * Errors: ENOSPC when the request exceeds the remaining pin budget,
 * ENXIO when no descriptor could be created, EFAULT when the memory
 * could not be prepared or a physical segment could not be resolved.
 */
int
mx_pin_page_cluster(mx_instance_state_t *is, mx_page_pin_t *pins, 
		    int npages, int flags,
		    int is_64bit, uint64_t arch_private)
{
  IOMemoryDescriptor *md;
  IOByteCount offset, len, contig;
  uint64_t dma;
  vm_address_t addr;
  struct uio *uio = NULL;


  if (mx_max_pinned_pages < npages) {
    mx_pin_page_nospace++;
    return ENOSPC;
  }

  len = npages * MX_VPAGE_SIZE;
  
  /* create the memory descriptor */
  if (!is_64bit) {
    addr = (vm_address_t)pins[0].va;
    if (flags & MX_PIN_KERNEL) {
      md = IOMemoryDescriptor::
	withAddress((void *)addr, len, kIODirectionOutIn);
    } else {
      md = IOMemoryDescriptor::
	withAddress(addr, len, kIODirectionOutIn, current_task());
    }
  } else { 
#if MX_DARWIN_XX >= 8  
    /* use a special function to pin 64-bit memory */
    md = mx_alloc_md(current_task(), pins[0].va, len, &uio);
#else
    md = NULL;
#endif
  }

  if (!md)
    return ENXIO;

  /*
   * This involves paging in the memory, if necessary, and wiring it
   * down for the duration of the transfer.  The memory can be unwired
   * using md->complete().  This is totally atomic from the point of
   * view of our thread; no locks need or should be taken.
   */
  if (md->prepare(kIODirectionOutIn) != kIOReturnSuccess)
    goto abort_with_md;

  offset = 0;
  while (offset < len) {
    contig = 0;
    if (IOMapper::gSystem)
      dma = (uint64_t)md->getPhysicalSegment(offset, &contig);
    else
      dma = md->getPhysicalSegment64(offset, &contig);

    /* A failed lookup, or a segment shorter than one MX page, would
       make the subtraction below wrap and run off the end of pins[]. */
    if (dma == 0 || contig < MX_VPAGE_SIZE)
      goto abort_with_prepare;

    /* one pins[] entry per MX page inside this contiguous segment */
    do {
      pins[offset / MX_VPAGE_SIZE].dma.low = MX_LOWPART_TO_U32(dma);
      pins[offset / MX_VPAGE_SIZE].dma.high = MX_HIGHPART_TO_U32(dma);
      contig -= MX_VPAGE_SIZE;
      offset += MX_VPAGE_SIZE;
      dma += MX_VPAGE_SIZE;
    } while (contig >= MX_VPAGE_SIZE && offset < len);
  }

  pins[0].md = (void *) md;
  pins[0].uio = uio;
  OSAddAtomic(-1 * npages, &mx_max_pinned_pages);
  return 0;

 abort_with_prepare:
  md->complete(kIODirectionOutIn);     /* unwire what prepare() wired */

 abort_with_md:
  md->release ();
#if MX_DARWIN_XX >= 8
  if (uio != NULL)
    uio_free(uio);
#endif

  return EFAULT;
}

/*
 *  The MacOSX IOMMU code tacks an extra "bogus" page onto every
 *  allocation, and divides its allocation zones into powers of two.
 *  So we make the most efficient use of space if we pin one
 *  page less than a power of two each time. 
 */

#define MX_PAGE_CLUSTER_CNT 127	/* 2^7 - 1: largest page count handed to one mx_pin_page_cluster() call */

/*
 * Pin npages pages described by pins[], in clusters of at most
 * MX_PAGE_CLUSTER_CNT pages, and store each page's DMA address in
 * network byte order in mdesc[].
 *
 * Returns 0 on success, ENOSPC when the request exceeds the remaining
 * pin budget, or the error of the failing mx_pin_page_cluster() call.
 * On such a failure the clusters pinned so far are unpinned again.
 */
int
mx_pin_vpages(mx_instance_state_t *is, mx_page_pin_t *pins, 
	      mcp_dma_addr_t   *mdesc, int npages, int flags, 
	      uint64_t arch_private)
{
  int done, cluster, rc, k;
  int is_64bit = 0;

  if (npages > mx_max_pinned_pages) {
    mx_pin_page_nospace++;
    return ENOSPC;
  }

#if MX_DARWIN_XX >= 8
  /* user memory: find out whether the caller uses 64-bit addresses */
  if (!(flags & MX_PIN_KERNEL)) {
    is_64bit = proc_is64bit(current_proc());
  }
#endif

  done = 0;
  while (done < npages) {
    cluster = npages - done;
    if (cluster > MX_PAGE_CLUSTER_CNT) {
      cluster = MX_PAGE_CLUSTER_CNT;
    }

    rc = mx_pin_page_cluster(is, &pins[done],
			     cluster, flags, is_64bit, arch_private);
    if (rc != 0) {
      /* roll back every cluster that was pinned before this one */
      if (done != 0) {
	mx_unpin_vpages(is, pins, done, 0);
      }
      return rc;
    }

    for (k = done; k < done + cluster; k++) {
      mdesc[k].low = htonl(pins[k].dma.low);
      mdesc[k].high = htonl(pins[k].dma.high);
    }
    done += cluster;
  }

  return 0;
}


/*
 * Release one cluster pinned by mx_pin_page_cluster(): unwire and drop
 * the descriptor stored in the cluster's first pin entry, free its uio
 * if there is one, and return npages to mx_max_pinned_pages.
 */
void
mx_unpin_page_cluster(mx_instance_state_t *is, mx_page_pin_t *pin, int npages, int flags)
{
  IOMemoryDescriptor *desc = (IOMemoryDescriptor *) pin->md;

  desc->complete(kIODirectionOutIn);   /* unwire */
  desc->release();                     /* drop the descriptor */

#if MX_DARWIN_XX >= 8
  if (pin->uio != NULL) {
    uio_free(pin->uio);
  }
#endif

  OSAddAtomic(npages, &mx_max_pinned_pages);
}

/*
 * Unpin npages pages, walking pins[] in the same
 * MX_PAGE_CLUSTER_CNT-sized clusters that mx_pin_vpages() uses.
 */
void
mx_unpin_vpages(mx_instance_state_t *is, mx_page_pin_t *pins, int npages, int flags)
{
  int done = 0;
  int cluster;

  while (done < npages) {
    cluster = npages - done;
    if (cluster > MX_PAGE_CLUSTER_CNT) {
      cluster = MX_PAGE_CLUSTER_CNT;
    }
    mx_unpin_page_cluster(is, &pins[done], cluster, flags);
    done += cluster;
  }
}
  
/*
 * Drain the endpoint's list of user mappings: each entry is unlinked,
 * its mapping is unmapped and released, its descriptor is released and
 * the list node is freed.
 */
void
mx_munmapall(mx_endpt_state_t *es)
{
  struct mmapmd_entry *node;

  for (node = SLIST_FIRST(&es->arch.mmapmd);
       node != NULL;
       node = SLIST_FIRST(&es->arch.mmapmd)) {
    IOMemoryMap *mapping;
    IOMemoryDescriptor *desc;

    SLIST_REMOVE_HEAD(&es->arch.mmapmd, entries);

    mapping = (IOMemoryMap *) node->map;
    desc = (IOMemoryDescriptor *) node->md;

    mapping->unmap();
    mapping->release();
    desc->release();
    mx_kfree(node);
  }
}

/*
 * Map a range of driver memory into the calling task.
 *
 * arg is the user address of an mx_mmap_t.  Its offset and len fields
 * select the range; on success the va field is filled in with the user
 * virtual address of the new mapping and the structure is copied back
 * out.
 *
 * The mapping is remembered on es->arch.mmapmd so that mx_munmapall()
 * can tear it down later.
 *
 * Returns 0 on success, ENXIO for an unaligned offset or when an
 * allocation, descriptor or mapping cannot be created, or the error
 * reported by copyin/copyout or mx_mmap_off_to_kva().
 */
int
mx_mmap(mx_endpt_state_t *es, mx_uaddr_t arg)
{
  mx_mmap_t i;
  int status;
  void *kva;
  task_t task;
  IOMemoryDescriptor *md;
  IOMemoryMap *user_map;
  mx_page_pin_t *dontcare;
  struct mmapmd_entry *entry;
  unsigned long off, pos, len;
  int mem_type;

  status = mx_arch_copyin(arg, &i, sizeof(i));
  if (status)
    goto abort_with_nothing;

  off = i.offset;
  len = i.len;
  /* validate mapping */

  if (off & (PAGE_SIZE - 1)) {
    status = ENXIO;
    goto abort_with_nothing;
  }

  mx_mutex_enter(&es->sync);
  for (pos = 0; pos < len; pos += PAGE_SIZE) {
    /*
     * See if each page of the request is in range
     */
    
    status = mx_mmap_off_to_kva(es, off + pos, &kva, &mem_type, &dontcare);
    if (status != 0) {
      IOLog("status = %d, pos = 0x%lx, len = 0x%lx\n", status, pos, len);
      goto abort_with_mutex;
    }
  }
  

  /* Look up the kernel virtual address of the start of the range.  The
     result is checked because the loop above validates nothing when
     len is 0, which would otherwise leave kva unset. */
  status = mx_mmap_off_to_kva(es, off, &kva, &mem_type, &dontcare);
  if (status != 0)
    goto abort_with_mutex;

  entry = (struct mmapmd_entry *) mx_kmalloc(sizeof (*entry), MX_WAITOK);
  if (entry == NULL) {
    MX_WARN(("mx_kmalloc failed for md list entry\n"));
    status = ENXIO;
    goto abort_with_mutex;
  }	

  task = current_task();
  if (task == NULL) {
    MX_WARN(("mx_mmap: No task!?\n"));
    status = ENXIO;
    goto abort_with_entry;
  }	

  md = IOMemoryDescriptor::withAddress(kva, len, kIODirectionOutIn);
  if (!md) {
    MX_WARN(("IOMemoryDescriptor::withAddress failed for %p len  %d\n",
	     kva, i.len));
    status = ENXIO;
    goto abort_with_entry;
  }

  /* create a region i.len bytes long in this process's address space */
  user_map = md->map(task, (unsigned int) 0, kIOMapAnywhere, 0, len);
  if (!user_map) {
    MX_WARN(("IOMemoryDescriptor::map failed for %p len  %d\n",
	     kva, i.len));
    status = ENXIO;
    goto abort_with_md;
  }

  i.va = (uint64_t)user_map->getVirtualAddress();

#if MX_CPU_x86||MX_CPU_x86_64
  /* hack!  MacOSX Intel 64-bit procs start their heap just above 4GB,
     and the getVirtualAddress() above is a 32-bit interface, so it
     truncates off the high bits.  Worse, IOMemoryDescriptor::map()
     cannot allocate addresses below 4GB if you specify an
     address. This is because the kernel keeps the address range below
     4GB unmapped in order to catch bugs..  All we can do is to assume
     that the address allocated will be between 4GB and 8GB and
     restore the lost bit.  Somebody needs to take Apple's "simple"
     32-bit kernel out and shoot it in head!
  */
  if (es->arch.is_64b && (i.va & (0xffffffffULL << 32)) == 0)
	  i.va |= (1ULL << 32);
#endif

  /* Tell the user where to find it */
  status = mx_arch_copyout(&i, arg, sizeof(i));
  if (status) {	
	  MX_WARN(("mx_macos_mmap: copyout failed -  addr 0x%llx\n", 
		  (unsigned long long)arg));
    goto abort_with_user_map;
  }

  /* save the map entry on the arch-specific endpt "entry" list.  It
   * will be freed when the endpt is closed or when mx_munmap is called.
   */
  entry->md = (void *) md;
  entry->map = (void *) user_map;
  SLIST_INSERT_HEAD(&es->arch.mmapmd, entry, entries);

  mx_mutex_exit(&es->sync);

  return 0;


 abort_with_user_map:
  user_map->release();

 abort_with_md:
  md->release();

 abort_with_entry:
  mx_kfree(entry);

 abort_with_mutex:
  mx_mutex_exit(&es->sync);

 abort_with_nothing:
  return status;
}

/* Get data from another process user space */
/*
 * Copy length bytes from address src in src_task into the kernel
 * buffer dst.  The source range is described by a memory descriptor
 * (the uio-based variant when is_64bit is set), mapped into the
 * kernel, copied with memcpy() and then torn down again.
 *
 * Returns 0 on success, ENOMEM when no descriptor could be built,
 * EFAULT when the range could not be mapped, ENOSPC when the mapping
 * has no kernel virtual address.
 */
int
mx_arch_copy_from_task(mx_uaddr_t src, task_t src_task, int is_64bit,
		       void * dst,
		       uint32_t length)
{
  IOMemoryDescriptor *desc;
  IOMemoryMap *mapping;
  struct uio *uio = NULL;
  IOVirtualAddress kernel_va;
  int rc = 0;

  /* describe the source range */
  if (is_64bit) {
#if MX_DARWIN_XX >= 8
    desc = mx_alloc_md(src_task, src, length, &uio);
#else
    desc = NULL;
#endif
  } else {
    desc = IOMemoryDescriptor::withAddress((vm_address_t)src, length,
					   kIODirectionOutIn, src_task);
  }

  if (desc == NULL) {
    return ENOMEM;
  }

  /* bring the range into the kernel's address space */
  mapping = desc->map();
  if (mapping == NULL) {
    rc = EFAULT;
  } else {
    kernel_va = mapping->getVirtualAddress();
    if (kernel_va == 0) {
      MX_WARN(("mx_arch_copy_from_task:NULL src_md!\n"));
      rc = ENOSPC;
    } else {
      memcpy(dst, (void *)kernel_va, length);
    }

    /* the mapping goes away whether or not the copy happened */
    mapping->unmap();
    mapping->release();
  }

  desc->release();
#if MX_DARWIN_XX >= 8
  if (uio != NULL) {
    uio_free(uio);
  }
#endif

  return rc;
}

/* 
 * MacOSX uses a 4GB + 4GB kernel, but we can't remap too much into
 * the kernel's address space.  The max that copyin/copyout support is
 * 256MB; beyond that, they return ENAMETOOLONG.  Set the chunk size to
 * something fairly small to be conservative
 *
 * Moreover, we need to avoid copying out across 256MB boundaries to
 * avoid bugs in MacOSX 10.4.  We have seen memory corruption when
 * doing this.
 */
/* Upper bound on the bytes moved by one mx_direct_get_chunk() call:
   MX_PAGE_CLUSTER_CNT pages, presumably converted to bytes by MX_PTOA
   (macro defined elsewhere -- confirm). */
#define MX_DIRECT_GET_CHUNK_SIZE (MX_PTOA(MX_PAGE_CLUSTER_CNT))

/*
 * Copy one chunk of length bytes from address src in src_task to user
 * address dst of the current task.  The source range is mapped into
 * the kernel and then written out with mx_arch_copyout().
 *
 * Returns 0 on success, ENOMEM when no descriptor could be built,
 * EFAULT when the range could not be mapped, ENOSPC when the mapping
 * has no kernel virtual address, or the copyout error.
 */
int
mx_direct_get_chunk(task_t src_task, mx_uaddr_t dst,
		    mx_uaddr_t src, uint32_t length, int is_64bit)
{
  IOMemoryDescriptor *desc;
  IOMemoryMap *mapping;
  struct uio *uio = NULL;
  IOVirtualAddress kernel_va;
  int rc = 0;

  /* describe the source range */
  if (is_64bit) {
#if MX_DARWIN_XX >= 8
    desc = mx_alloc_md(src_task, src, length, &uio);
#else
    desc = NULL;
#endif
  } else {
    desc = IOMemoryDescriptor::withAddress((vm_address_t)src, length,
					   kIODirectionOutIn, src_task);
  }

  if (desc == NULL) {
    return ENOMEM;
  }

  /* bring the range into the kernel's address space */
  mapping = desc->map();
  if (mapping == NULL) {
    rc = EFAULT;
  } else {
    kernel_va = mapping->getVirtualAddress();
    if (kernel_va == 0) {
      MX_WARN(("mx_direct_get_chunk:NULL src_md!\n"));
      rc = ENOSPC;
    } else {
      rc = mx_arch_copyout((void *)kernel_va, dst, length);
      if (rc != 0) {
	MX_WARN(("mx_direct_get_chunk: copyout to 0x%llx fails with %d\n", 
		 (unsigned long long)dst, rc));
      }
    }

    /* the mapping goes away whether or not the copy happened */
    mapping->unmap();
    mapping->release();
  }

  desc->release();
#if MX_DARWIN_XX >= 8
  if (uio != NULL) {
    uio_free(uio);
  }
#endif

  return rc;
}

/* OS specific callback for direct get, copying from another process
 * user-space to current process user-space.
 */

/* Context filled in by mx_direct_get() and handed to
   mx_arch_copy_user_to_user() through its void *src_space argument. */
struct direct_get_callback_param {
  task_t src_task;	/* task whose address space holds the source data */
  int is_64bit;		/* non-zero selects the uio-based descriptor path (mx_alloc_md) */
};

/*
 * Copy length bytes from src in another task to dst in the current
 * task.  src_space points at a struct direct_get_callback_param that
 * names the source task.
 *
 * The copy is cut into pieces of at most MX_DIRECT_GET_CHUNK_SIZE
 * bytes, and a piece never crosses a 256MB boundary of the
 * destination.  Returns 0, or the first error from
 * mx_direct_get_chunk().
 */
int
mx_arch_copy_user_to_user(mx_uaddr_t dst,
			  mx_uaddr_t src, void * src_space,
			  uint32_t length)
{
  struct direct_get_callback_param *cb =
    (struct direct_get_callback_param *) src_space;
  task_t from_task = cb->src_task;
  int wide = cb->is_64bit;
  uint32_t step;
  mx_uaddr_t boundary;
  int rc = 0;

  while (length != 0 && rc == 0) {
    step = length;
    if (step > MX_DIRECT_GET_CHUNK_SIZE) {
      step = MX_DIRECT_GET_CHUNK_SIZE;
    }

    /* first 256MB boundary strictly above dst */
    boundary = (dst + 0x10000000) & ~((mx_uaddr_t)(0x10000000 - 1));
    if (step + dst > boundary) {
      step = boundary - dst;
    }

    rc = mx_direct_get_chunk(from_task, dst, src, step, wide);

    dst += step;
    src += step;
    length -= step;
  }

  return rc;
}

/*
 * Copy length bytes from the segments of src_es's task into the
 * segments of dst_es's task, which must be the current task.
 *
 * When a side has more than one segment, its first entry's vaddr is
 * the user address of the real segment array.  That array is fetched
 * into a temporary kernel copy: with copyin for the destination, and
 * with mx_arch_copy_from_task() for the source.
 *
 * Returns EINVAL when either task is missing or the destination is not
 * the current task, ENOMEM when a temporary array cannot be allocated,
 * otherwise the result of the fetches or of mx_direct_get_common().
 *
 * NOTE(review): is_64bit is taken from the current (destination)
 * process but is applied to source-task addresses -- confirm both
 * sides always share the same address width.
 */
int
mx_direct_get(mx_endpt_state_t *dst_es, mx_shm_seg_t *dst_segs, uint32_t dst_nsegs,
	      mx_endpt_state_t *src_es, mx_shm_seg_t *src_segs, uint32_t src_nsegs,
	      uint32_t length)
{
  struct direct_get_callback_param cb;
  task_t from_task, to_task;
  mx_uaddr_t user_list;
  int rc = EINVAL;

  from_task = src_es->arch.task;
  to_task = dst_es->arch.task;

  if (from_task == NULL || to_task == NULL) {
    return rc;
  }

  if (to_task != current_task()) {
    MX_WARN(("mx_direct_get() to non curproc?"));
    return rc;
  }

  /* fetch the destination segment array from the current process */
  if (dst_nsegs > 1) {
    user_list = dst_segs[0].vaddr;
    dst_segs = (mx_shm_seg_t *) mx_kmalloc(dst_nsegs * sizeof(*dst_segs), 0);
    if (dst_segs == NULL) {
      return ENOMEM;
    }
    rc = mx_arch_copyin(user_list, dst_segs, dst_nsegs * sizeof(*dst_segs));
    if (rc != 0) {
      goto release_dst;
    }
  }

  cb.src_task = from_task;
  cb.is_64bit = 0;
#if MX_DARWIN_XX >= 8
  cb.is_64bit = proc_is64bit(current_proc());
#endif

  /* fetch the source segment array from the source task */
  if (src_nsegs > 1) {
    user_list = src_segs[0].vaddr;
    src_segs = (mx_shm_seg_t *) mx_kmalloc(src_nsegs * sizeof(*src_segs), 0);
    if (src_segs == NULL) {
      rc = ENOMEM;
      goto release_dst;
    }
    rc = mx_arch_copy_from_task(user_list, from_task, cb.is_64bit, src_segs,
				src_nsegs * sizeof(*src_segs));
    if (rc != 0) {
      goto release_src;
    }
  }

  rc = mx_direct_get_common(dst_segs, dst_nsegs,
			    &cb, src_segs, src_nsegs,
			    length);

 release_src:
  /* only multi-segment lists were replaced by kernel copies */
  if (src_nsegs > 1) {
    mx_kfree(src_segs);
  }
 release_dst:
  if (dst_nsegs > 1) {
    mx_kfree(dst_segs);
  }
  return rc;
}

/****************************************************************
 * PCI config space functions
 ****************************************************************/
/*
 * Generates mx_read_pci_config_<size>(): reads a <size>-bit value from
 * PCI config space at offset through the driver object's pciNub and
 * stores it in *value.  Always returns 0.
 */
#define pcibios_to_mx_read(size)                                           \
int                                                                        \
mx_read_pci_config_##size (mx_instance_state_t *is,                        \
			   uint32_t offset, uint##size##_t *value)         \
{                                                                          \
  mx_driver *drv = (mx_driver *)is->arch.cpp_class;                        \
                                                                           \
  *value = drv->pciNub->configRead##size (offset);                         \
  return 0;                                                                \
}
pcibios_to_mx_read(32)
pcibios_to_mx_read(16)
pcibios_to_mx_read(8)


/*
 * Generates mx_write_pci_config_<size>(): writes a <size>-bit value to
 * PCI config space at offset through the driver object's pciNub.
 * Always returns 0.
 */
#define pcibios_to_mx_write(size)                                          \
int                                                                        \
mx_write_pci_config_##size (mx_instance_state_t *is,                       \
			    uint32_t offset, uint##size##_t value)         \
{                                                                          \
  mx_driver *drv = (mx_driver *)is->arch.cpp_class;                        \
                                                                           \
  drv->pciNub->configWrite##size (offset, value);                          \
  return 0;                                                                \
}
pcibios_to_mx_write(32)
pcibios_to_mx_write(16)
pcibios_to_mx_write(8)

}
